1 Getting ready

1.1 Load packages

library(here) # A Simpler Way to Find Your Files, CRAN v1.0.1
library(tidyverse) # Easily Install and Load the 'Tidyverse', CRAN v1.3.0 
library(qiime2R) # Import qiime2 artifacts to R, [github::jbisanz/qiime2R] v0.99.35  
library(phyloseq) # Handling and analysis of high-throughput microbiome census data, Bioconductor v1.34.0 
library(microbiome) # Microbiome Analytics, Bioconductor v1.12.0
library(picante) # Integrating Phylogenies and Ecology, CRAN v1.8.2 
library(MicrobeR) # Handy functions for microbiome analysis in R, [github::jbisanz/MicrobeR] v0.3.2 
library(mixOmics) # Omics Data Integration, Bioconductor v6.14.0 
library(RUVSeq) # Remove Unwanted Variation from RNA-Seq Data, Bioconductor v1.24.0
library(sva) # Surrogate Variable Analysis, Bioconductor v3.38.0 
library(cowplot) # Streamlined Plot Theme and Plot Annotations for 'ggplot2', CRAN v1.1.1  
library(ggpubr) # 'ggplot2' Based Publication Ready Plots, CRAN v0.4.0
library(ggh4x) # Hacks for 'ggplot2', CRAN v0.1.2.1 
library(plotly) # Create Interactive Web Graphics via 'plotly.js', CRAN v4.9.3
library(RColorBrewer) # ColorBrewer Palettes, CRAN v1.1-2 
library(vegan) # Community Ecology Package, CRAN v2.5-7 

1.2 Import data

# metadata
mtd <- read_tsv(here("data/metadata.tsv"), comment = "#q2") %>%
  rename(SampleID = "#SampleID") %>% 
  column_to_rownames("SampleID") %>% 
  mutate(PCRBatch   = factor(PCRBatch),
         Diet = factor(Diet, levels = c("REF", "IM")),
         Compartment = factor(Compartment, levels = c("PI", "DI")),
         SampleOrigin = factor(
           SampleOrigin, 
           levels = c("Feed", "Water", "Digesta", "Mucosa",
                      "Mock", "DNA-extraction", "Amplicon-PCR")),
         SampleType = factor(
           SampleType, 
           levels = c("REF-PID", "REF-PIM", "REF-DID", "REF-DIM", 
                      "IM-PID", "IM-PIM", "IM-DID", "IM-DIM", 
                      "Feed", "Water", "Extraction-blank", "PCR-blank", "Mock"))) 

# feature table
otu <- read_qza(here("data/intermediate/qiime2/97otu/table-filtered-97otu-sepp-inserted.qza")) %>%
  pluck("data") %>% 
  as("matrix")

# taxonomy
txnm <- read_qza(here("data/intermediate/qiime2/asv/taxonomy-silva132.qza"))
txnm <- txnm$data %>% 
  as.data.frame() %>%
  mutate(Taxon = gsub("D_0", "k", Taxon), Taxon = gsub("D_1", "p", Taxon),
         Taxon = gsub("D_2", "c", Taxon), Taxon = gsub("D_3", "o", Taxon),
         Taxon = gsub("D_4", "f", Taxon), Taxon = gsub("D_5", "g", Taxon),
         Taxon = gsub("D_6", "s", Taxon)) %>%
  separate(Taxon, sep = ";", c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species")) %>% 
  column_to_rownames("Feature.ID") %>%
  select(-Confidence)

# phylogeny
tree <- read_qza(here("data/intermediate/qiime2/97otu/insertion-tree-97otu.qza"))

# assemble a phyloseq object
ps <- phyloseq(sample_data(mtd), 
               otu_table(otu, taxa_are_rows = TRUE),
               tax_table(as.matrix(txnm)),
               phy_tree(tree$data))

1.3 Data preprocessing

# change OTU names
indx <- formatC(1:ntaxa(ps), width = nchar(ntaxa(ps)), format = "d", flag = "0")
taxa_names(ps) <- paste0("OTU", indx)

# filter data
ps <- subset_samples(ps, !SampleType %in% c("Extraction-blank", "PCR-blank", "Mock")) # remove control samples

# extract metadata, otu table, taxonomy and phylogeny from phyloseq
mtd <- data.frame(sample_data(ps), check.names = FALSE)
otu <- as.data.frame(otu_table(ps)) %>% t()
txnm <- tax_table(ps) %>% as("matrix") %>% as.data.frame()
tree <- phy_tree(ps)

2 Assess batch effect

# Centered log-ratio transformation
otu_clr <- logratio.transfo(otu, logratio = "CLR", offset = 1 ) %>% as("matrix")

# PCA
pca_before <- pca(otu_clr, ncomp = 3)

# PCA plot
pcaPlot_before <- cbind(mtd, pca_before$variates$X) %>%
  ggplot(aes(x = PC1, y = PC2, color = SampleType, shape = PCRBatch)) +
  geom_point(size = 2) +
  geom_line(aes(group = SampleName), color = "black") + # connect paired technical replicates by line
  labs(title = "Before batch correction", color = "Sample type", shape = "PCR batch", 
       x = paste0("PC1: ", round(100 * pca_before$explained_variance[1], 2), "%"),
       y = paste0("PC2: ", round(100 * pca_before$explained_variance[2], 2), "%")) +
  scale_color_manual(values = brewer.pal(n = 12, name = "Paired")[c(1:10, 12)]) +
  theme_bw() 

pcaPlot_before 

From the PCA plot, we can tell that technical replicate samples are far apart. Mucosa samples from fish fed the REF diet form 2 clusters based on PCR batches. There’s a strong PCR batch effect.

3 Correct batch effect

3.1 RUVSeq

RUVSeq removes unwanted variation in RNASeq data using replicate samples and negative control genes. Here we use it for correcting batch effects in microbiome data. The two main parameters of RUVSeq are the number of factors of unwanted variation, k, and the set of negative control variables genes, or OTUs in our case. The choice of the parameter k is not easy and is data-dependent. Empirically, a few (2-3) factors are enough to capture the unwanted variation. In noisy data, K can be increased to 5 or 10. Note that if k is too high, the RUVSeq over-corrects for unwanted variation and ends up removing (almost) all the biological variability within the conditions. The choice of negative control variables is also somewhat data-dependent. However, RUVSeq is robust to the choice of negative control variables. One is recommend to perform extensive exploratory data analysis, comparing different values of k and sets of negative control variables.

# calculate coefficient of variation (cv) for each otu
cv <- t(otu) %>%
  as.data.frame() %>%
  rownames_to_column("featureID") %>%
  mutate(sd   = apply(.[, names(.) != "featureID"], 1, sd),
         mean = apply(.[, names(.) != "featureID"], 1, mean),
         cv   = sd / mean) %>%
  arrange(cv)

# proportions of otus used as negative controls
prp <- c(0.1, 0.2, 0.5, 1) 

# number of unwanted variation factors 
k <- c(1, 2, 3, 4, 5, 10) 

# get combinations of k and prp for parameter tuning
prmt <- expand.grid(prp = prp, k = k)

# sample replicate 
SampleType <- mtd$SampleType
names(SampleType) <- rownames(mtd)
rpl <- makeGroups(SampleType)

# batch effect correction
ruv <- map2(
  prmt$prp,
  prmt$k,
  function(x, y) 
  {
    # get otus showing least variance at defined proportions
    min_cv <- cv[1:round(nrow(cv) * x), ] %>%
      mutate(featureID = paste0("^", featureID, "$")) # exact math of otu ids
   
    # subset otus used as negative controls
    nc <- grepl(paste(min_cv$featureID, collapse = "|"), colnames(otu))
    
    # run RUVs
    ruv <- RUVs(t(otu), nc, y, rpl)
  }
)

# assign names to list elements
names(ruv) <- paste0("ruv_k", prmt$k, "_nc", prmt$prp)

3.2 ComBat-Seq

ComBat-seq is a batch effect correction method for RNASeq data. It is an improved model based on the popular effect correction tool, ComBat.ComBat-seq takes raw count matrix as input. Same as ComBat, it requires a known batch variable.

# PCR batch
PCRBatch <- mtd$PCRBatch
names(PCRBatch) <- rownames(mtd)

# model matrix
mod_mtrx <- model.matrix( ~ SampleType)

# run CombatSeq
cmbt <- ComBat_seq(t(otu), batch = PCRBatch, covar_mod = mod_mtrx) 
## Found 3 batches
## Using null model in ComBat-seq.
## Adjusting for 9 covariate(s) or covariate level(s)
## Estimating dispersions
## Fitting the GLM model
## Shrinkage off - using GLM estimates for parameters
## Adjusting the data

4 Evaluate methods

4.1 Visual inspection: PCA plot

4.1.1 RUVSeq

Extract and log ratio transform batch effect corrected OTU table.

ruv_clr <- purrr::map(ruv, ~pluck(.x, "normalizedCounts")) %>% # get batch corrected count table
  purrr::map(~t(.x)) %>% # transpose normalized count table
  purrr::map(~logratio.transfo(.x, logratio = 'CLR', offset = 1)) %>% # CLR transformation
  purrr::map(~as(.x, "matrix")) 

Run PCA.

pca_ruv <- purrr::map(ruv_clr, ~pca(.x, ncomp = 3)) 

Make PCA plots.

The PCA plots with k = 4, 5 or 10 look promising. Let’s look at one of them in 3d dimensions.

# extract data
k4_nc1 <- cbind(mtd, pca_ruv$ruv_k4_nc1$variates$X)

# 3d PCA plot with plot_ly
plot_ly(
  x = k4_nc1[, "PC1"], y = k4_nc1[, "PC2"], z = k4_nc1[, "PC3"], 
  type = "scatter3d", mode = "markers", color = k4_nc1$SampleType, 
  colors = "Paired") %>% #brewer.pal(n = 12, name = "Paired")
  layout(scene = list(
    xaxis = list(title = paste0('PC1: ', round(pca_ruv$ruv_k4_nc1$explained_variance[1]*100, 2), "%")),
    yaxis = list(title = paste0('PC2: ', round(pca_ruv$ruv_k4_nc1$explained_variance[2]*100, 2), "%")),
    zaxis = list(title = paste0('PC3: ', round(pca_ruv$ruv_k4_nc1$explained_variance[3]*100, 2), "%"))
    )
  )

4.1.2 ComBat-Seq

# centered log-ratio transformation
cmbt_clr <- logratio.transfo(t(cmbt), logratio = 'CLR', offset = 1) %>% 
  as("matrix")

# PCA
pca_cmbt <- pca(cmbt_clr, ncomp = 3)

# make PCA plot
pcaPlot_cmbt <- cbind(mtd, pca_cmbt$variates$X) %>%
  ggplot(aes(x = PC1, y = PC2, color = SampleType, shape = PCRBatch)) +
  geom_point(size = 2) +
  geom_line(aes(group = SampleName), color = "black") +
  labs(title = "ComBat-Seq", 
       color = "Sample type", shape = "PCR batch", 
       x = paste0("PC1: ", round(100 * pca_cmbt$explained_variance[1], 2), "%"),
       y = paste0("PC2: ", round(100 * pca_cmbt$explained_variance[2], 2), "%")) +
  scale_color_manual(values = brewer.pal(n = 12, name = "Paired")[c(1:10, 12)]) +
  theme_bw() 

pcaPlot_cmbt

4.2 Variance explained

4.2.1 Run pRDA

The partial redundancy analysis (pRDA) is a multivariate method to assess globally the effect of treatments and batch. Here, we run pRDA and get variance explained by treatments and batch.

# combine otu table before and after batch corrections in a list
otus <- c(before = list(otu_clr), ruv_clr, combatseq = list(cmbt_clr)) 

# merge otu tables and nest by batch correction methods 
otus_nst <- purrr::map(otus, as.data.frame) %>%
  bind_rows(.id = "batch_correction") %>%
  group_nest(batch_correction, .key = "data")

# define experiment design
design <- data.frame(SampleType = rep(SampleType, length(otus)), 
                     PCRBatch = rep(PCRBatch, length(otus)), 
                     batch_correction = rep(names(otus), each = nrow(mtd)))

design_nst <- group_nest(design, batch_correction, .key = "design")

# run pRDA and extract variance explained by treatment and batch effects
rda <- inner_join(otus_nst, design_nst, by = "batch_correction") %>%
  mutate(rda_trt = map2(data, design, ~rda(.x ~ PCRBatch + Condition(SampleType), data = .y)), 
         rda_bat = map2(data, design, ~rda(.x ~ SampleType + Condition(PCRBatch), data = .y)),
         var_trt = map_dbl(rda_trt, ~.x$pCCA$tot.chi*100/.x$tot.chi),
         var_bat = map_dbl(rda_bat, ~.x$pCCA$tot.chi*100/.x$tot.chi),
         batch_correction = factor(batch_correction, levels = names(otus)))  %>%
  select(batch_correction, var_trt, var_bat)  %>%
  rename(Method = batch_correction, Treatment = var_trt, Batch = var_bat)  %>%
  pivot_longer(Treatment:Batch, names_to = "Type", values_to = "var_expl")  %>%
  mutate(var_expl = round(var_expl, 2))

4.2.2 Plotting

Make bar plots showing variance explained by treatments and batch.

var_expl <- ggplot(rda, aes(x = fct_rev(Method), y = var_expl, fill = Type)) + 
  geom_bar(stat = "identity", position = 'dodge', colour = 'black') + 
  geom_text(aes(Method, var_expl + 5, label = var_expl), size = 3,
            position = position_dodge(width = 0.9)) + 
  coord_flip() +
  scale_y_continuous(limits = c(0, 100), expand = expansion(mult = c(0, 0))) +
  labs(x = "", y = "Variance explained (%)") + 
  theme_bw(base_size = 12) +
  guides(fill = guide_legend(reverse = TRUE)) 

var_expl

4.3 Summary

Based on the PCA plots and partial redundancy analysis (pRDA), we can conclude that the RUVSeq provides better batch effect corrections than the Combat-Seq does. In agreement with previous data, the RUVSeq is robust to the choice of negative control variables. Increasing the number of factors of unwanted variation, k, removes more unwanted variation. Setting k = 4, 5 or 10 yields very similar results. To avoid over-correcting batch effects, we choose the smallest K value, ie., k = 4 and nc = 1.

5 Validate results

Get batch effect corrected OTU table and replace the original OTU table in the phyloseq object.

# extract batch effect corrected OTU table
otu_adj <- ruv$ruv_k4_nc1$normalizedCounts

# replace otu table in the phyloseq object
otu_table(ps) <- otu_table(otu_adj, taxa_are_rows = TRUE)

5.1 Taxonomy

Make a table containing top 10 most abundant taxa at genus level.

taxa_tab <- Summarize.Taxa(otu_adj, txnm)$Genus %>% Make.Percent() 
taxa_tab <- taxa_tab[order(rowMeans(taxa_tab), decreasing = T), ]
Others <- colSums(taxa_tab[11:nrow(taxa_tab), ])
taxa_tab <- rbind(taxa_tab[1:10, ], Others)

Tidy dataframe for making stacked bar plots.

taxa_tab <- as.data.frame(taxa_tab) %>%
  rownames_to_column("Taxa") %>%
  separate(Taxa, sep = ";", c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus")) %>% 
  mutate(Phylum = ifelse(is.na(Phylum)|Phylum == "NA"|grepl("uncultured|Ambiguous|metagenome", Phylum), 
                         Kingdom, 
                         Phylum),
         Class = ifelse(is.na(Class)|Class == "NA"|grepl("uncultured|Ambiguous|metagenome", Class), 
                        Phylum, 
                        Class),
         Order = ifelse(is.na(Order)|Order == "NA"|grepl("uncultured|Ambiguous|metagenome", Order), 
                        Class, 
                        Order),
         Family = ifelse(is.na(Family)|Family == "NA"|grepl("uncultured|Ambiguous|metagenome", Family), 
                         Order, 
                         Family),
         Genus = ifelse(is.na(Genus)|Genus == "NA"|grepl("uncultured|Ambiguous|metagenome", Genus), 
                        Family, 
                        Genus)) %>%
  select(-Kingdom, -(Class:Family)) %>%
  pivot_longer(-c(Phylum, Genus), names_to = "SampleID", values_to = "Abundance") %>%
  mutate(Phylum = gsub("p__", "", Phylum),
         Genus = gsub("g__", "", Genus),
         Genus = factor(Genus, levels = rev(unique(Genus)))) %>%
  inner_join(rownames_to_column(mtd, "SampleID"), by = "SampleID")

Make stacked bar plots.

# define color scheme
col <- c("grey", brewer.pal(n = 10, name = "Paired"))
    
# water samples
taxa_bar_water <- filter(taxa_tab, SampleType == "Water") %>%
  ggplot(aes(x = SampleID, y = Abundance, fill = Genus)) +
  geom_bar(stat = "identity") +
  labs(x = "", y = "") +
  scale_y_continuous(breaks = 0:10*10, expand = c(0,0)) + 
  scale_fill_manual(values = col) +
  guides(fill=guide_legend(ncol=1)) +
  facet_wrap(~ SampleOrigin) +
  theme_bw() +
  theme(axis.text.x = element_blank(),
        axis.text.y = element_blank(),
        strip.background = element_blank(),
        legend.margin = margin(0, 0, 0, 0),
        legend.box.margin = margin(5, -10, 5, -8)) 

# feed samples
taxa_bar_feed <- filter(taxa_tab, SampleType == "Feed") %>%
  ggplot(aes(x = Diet, y = Abundance, fill = Genus)) +
  geom_bar(stat = "identity", width = 0.5) +
  labs(x = "Sample", y = "") +
  scale_y_continuous(breaks = 0:10*10, expand = c(0,0)) + 
  scale_fill_manual(values = col) +
  facet_nested(~ SampleOrigin + Diet, scales = "free_x", nest_line = TRUE) +
  theme_bw() +
  theme(axis.text.x = element_blank(),
        axis.text.y = element_blank(),
        strip.background = element_blank(),
        legend.position = "none") 

# samples from fish fed the REF diet 
taxa_bar_ref <- filter(taxa_tab, !(IsTechnicalReplicate == "yes" & PCRBatch == 2)) %>%
  filter(grepl("REF-", SampleType)) %>%
  ggplot(aes(x = SampleID, y = Abundance, fill = Genus)) +
  geom_bar(stat = "identity") +
  labs(x = "", y = "Relative abundance (%)") +
  scale_y_continuous(breaks = 0:10*10, expand = c(0,0)) + 
  scale_fill_manual(values = col) +
  facet_nested(~ Diet + SampleOrigin + Compartment, scales = "free_x", nest_line = TRUE) +
  theme_bw() +
  theme(axis.text.x = element_blank(),
        strip.background = element_blank(),
        legend.position = "none") 

# samples from fish fed the IM diet 
taxa_bar_im <- filter(taxa_tab, !(IsTechnicalReplicate == "yes" & PCRBatch == 2)) %>%
  filter(grepl("IM-", SampleType)) %>%
  ggplot(aes(x = SampleID, y = Abundance, fill = Genus)) +
  geom_bar(stat = "identity") +
  labs(x = "", y = "") +
  scale_y_continuous(breaks = 0:10*10, expand = c(0,0)) + 
  scale_fill_manual(values = col) +
  facet_nested(~ Diet + SampleOrigin + Compartment, scales = "free_x", nest_line = TRUE) +
  theme_bw() +
  theme(axis.text.x = element_blank(),
        axis.text.y = element_blank(),
        strip.background = element_blank(),
        legend.position = "none") 

# assemble plots
plot_grid(
  taxa_bar_ref, 
  taxa_bar_feed + theme(plot.margin = margin(l = -0.5, unit = "cm")),
  taxa_bar_im + theme(plot.margin = margin(l = -0.4, unit = "cm")),
  taxa_bar_water + theme(plot.margin = margin(l = -0.3, unit = "cm")), 
  nrow =  1, align = 'h', axis = "tb", 
  rel_widths = c(6, 1.5, 6, 4.5)) 

Compared to the taxa barplot (data/intermediate/qiime2/97otu/taxa-bar-plots-filtered-97otu.qzv) before batch correction , the taxonomic composition of samples is very different after the batch correction by RUVSeq.

5.2 Alpha diversity

Compute alpha diversity indices.

# compute Observed features, Peilou's Evenness and Shannon's index
alph_npd <- alpha(ps, index = c("observed", "evenness_pielou", "diversity_shannon")) %>%
  rownames_to_column("SampleID")

# compute Faith's Phylogenetic Diversity
alph_pd <- pd(samp = t(otu_table(ps)), tree = phy_tree(ps), include.root = F) %>%
  select(PD) %>%
  rownames_to_column("SampleID")

# combine data
alph <- inner_join(alph_npd, alph_pd, by = "SampleID") %>%
  inner_join(rownames_to_column(mtd, "SampleID"), by = "SampleID") %>%
  rename("Observed OTUs" = observed, "Pielou's evenness" = evenness_pielou, 
         "Shannon's index" = diversity_shannon, "Faith's PD" = PD) %>%
  pivot_longer("Observed OTUs":"Faith's PD", names_to = "alph_indx", values_to = "value") %>%
  mutate(alph_indx = factor(alph_indx, levels = c("Observed OTUs", "Pielou's evenness", 
                                                  "Shannon's index", "Faith's PD")))

Alpha diversity of technical replicates.

filter(alph, IsTechnicalReplicate == "yes") %>%
  ggplot(aes(x = PCRBatch, y = value)) +
  geom_boxplot(aes(color = PCRBatch), width = 0.3) +
  geom_point(aes(color = PCRBatch)) +
  geom_line(aes(group = SampleName), linetype = "dashed") + 
  labs(x = "PCR batch", color = "PCR batch") +
  facet_wrap(~alph_indx, scales = "free_y") +
  scale_fill_brewer(palette = "Dark2") +
  theme_bw() 

Alpha diversity of REF-PIM and REF-DIM samples amplified in different PCR batches.

filter(alph, SampleType %in% c("REF-PIM", "REF-DIM")) %>%
  ggplot(aes(x = SampleType, y = value)) +
  geom_boxplot(width = 0.3) +
  geom_point(aes(color = PCRBatch)) +
  labs(x = "Sample type", color = "PCR batch") +
  facet_wrap(~ alph_indx, scales = "free_y") +
  scale_fill_brewer(palette = "Dark2") +
  theme_bw() 

In line with previous observations, unweighted alpha indices such as Observed OTUs and Faith’s PD are more easily influenced by technical variations than weighted alpha indices like Shannon’s index.

5.3 Beta diversity

OTU table summary.

sampleSum <- colSums(otu_adj) %>% sort()
sampleSum
## AqFl1-134 AqFl1-031 AqFl1-062 AqFl1-065 AqFl1-091 AqFl1-073 AqFl1-087 AqFl1-033 
##       873      1684      1782      1885      1888      1907      1941      1960 
## AqFl1-133 AqFl1-069 AqFl1-083 AqFl1-017 AqFl1-052 AqFl1-001 AqFl1-128 AqFl1-078 
##      1979      2164      2205      2210      2221      2259      2272      2287 
## AqFl1-089 AqFl1-024 AqFl1-012 AqFl1-020 AqFl1-061 AqFl1-126 AqFl1-044 AqFl1-124 
##      2321      2370      2452      2556      2558      2590      2665      2686 
## AqFl1-002 AqFl1-058 AqFl1-059 AqFl1-004 AqFl1-094 AqFl1-053 AqFl1-014 AqFl1-074 
##      2743      2776      2878      2945      2954      2987      2995      3098 
## AqFl1-129 AqFl1-038 AqFl1-049 AqFl1-035 AqFl1-077 AqFl1-047 AqFl1-009 AqFl1-025 
##      3140      3154      3157      3195      3271      3273      3332      3354 
## AqFl1-063 AqFl1-021 AqFl1-029 AqFl1-068 AqFl1-066 AqFl1-007 AqFl1-055 AqFl1-123 
##      3448      3480      3542      3591      3682      3694      3832      3908 
## AqFl1-041 AqFl1-125 AqFl1-045 AqFl1-057 AqFl1-084 AqFl1-027 AqFl1-015 AqFl1-032 
##      4040      4062      4087      4181      4201      4211      4251      4265 
## AqFl1-127 AqFl1-081 AqFl1-088 AqFl1-072 AqFl1-080 AqFl1-042 AqFl1-013 AqFl1-095 
##      4421      4423      4439      4443      4602      4605      4639      4645 
## AqFl1-022 AqFl1-043 AqFl1-096 AqFl1-054 AqFl1-056 AqFl1-076 AqFl1-040 AqFl1-003 
##      4694      4917      4926      4973      5100      5147      5337      5530 
## AqFl1-030 AqFl1-130 AqFl1-046 AqFl1-008 AqFl1-086 AqFl1-064 AqFl1-023 AqFl1-028 
##      5597      5622      5748      5849      5933      6292      6387      6621 
## AqFl1-079 AqFl1-034 AqFl1-060 AqFl1-092 AqFl1-011 AqFl1-010 AqFl1-016 AqFl1-070 
##      6750      6872      6926      7096      7354      7391      7429      7715 
## AqFl1-039 AqFl1-051 AqFl1-019 AqFl1-050 AqFl1-082 AqFl1-018 AqFl1-071 AqFl1-026 
##      7970      8849      8943      9125      9747      9874     10052     10225 
## AqFl1-006 AqFl1-093 AqFl1-115 AqFl1-090 AqFl1-037 AqFl1-036 AqFl1-075 AqFl1-118 
##     10926     11108     11298     11789     12187     12477     12660     13896 
## AqFl1-005 AqFl1-067 AqFl1-048 AqFl1-116 AqFl1-119 AqFl1-120 AqFl1-121 AqFl1-117 
##     14260     14401     16851     29419     31681     37869     38216     41612 
## AqFl1-122 
##     48178

We lost quite a lot of sequences after the batch effect correction. Here we rarefy OTU table at a sub-sampling depth of 1684 sequences.

ps_rf <- rarefy_even_depth(ps, sample.size = sampleSum[2], rngseed = 1910)

5.3.1 Jaccard distance

# pcoa analysis
ord_jac <- ordinate(ps_rf, method = "PCoA", distance = "jaccard")

# ordination plot
plot_ordination(ps_rf, ord_jac, color = "SampleType", shape = "PCRBatch") +
  scale_color_manual(values = brewer.pal(n = 12, name = "Paired")[c(1:10, 12)]) +
  geom_line(aes(group = SampleName), color = "black") +
  geom_point(size = 3) +
  labs(title = "PCoA of Jaccard distance after batch correcton by RUVSeq") +
  theme_bw() 

5.3.2 Bray-Curtis distance

# pcoa analysis
ord_bc <- ordinate(ps_rf, method = "PCoA", distance = "bray")

# ordination plot
plot_ordination(ps_rf, ord_bc, color = "SampleType", shape = "PCRBatch") +
  scale_color_manual(values = brewer.pal(n = 12, name = "Paired")[c(1:10, 12)]) +
  geom_line(aes(group = SampleName), color = "black") +
  geom_point(size = 3) +
  labs(title = "PCoA of Bray-Curtis distance after batch correcton by RUVSeq") +
  theme_bw() 

5.3.3 UniFrac distance

Unweighted UniFrac distance.

# pcoa analysis
ord_uwuf <- ordinate(ps_rf, method = "PCoA", distance = "unifrac", weighted = FALSE)

# ordination plot
plot_ordination(ps_rf, ord_uwuf, color = "SampleType", shape = "PCRBatch") +
  scale_color_manual(values = brewer.pal(n = 12, name = "Paired")[c(1:10, 12)]) +
  geom_line(aes(group = SampleName), color = "black") +
  geom_point(size = 3) +
  labs(title = "PCoA of unweighted UniFrac distance after batch correcton by RUVSeq") +
  theme_bw() 

Weighted UniFrac distance.

# pcoa analysis
ord_wuf <- ordinate(ps_rf, method = "PCoA", distance = "unifrac", weighted = TRUE)

# ordination plot
plot_ordination(ps_rf, ord_wuf, color = "SampleType", shape = "PCRBatch") +
  scale_color_manual(values = brewer.pal(n = 12, name = "Paired")[c(1:10, 12)]) +
  geom_line(aes(group = SampleName), color = "black") +
  geom_point(size = 3) +
  labs(title = "PCoA of weighted UniFrac distance after batch correcton by RUVSeq") +
  theme_bw() 

PcoA plots based on various distance metrics show that batch effect is partly removed after the adjustment by RUVSeq.

6 Conclusion

RUVSeq largely corrected PCR batch effects. However, the batch correction removed a large proportions of sequences in many samples and resulted in very different taxonomic compositions. Hence, we’ll not correct the PCR batch effect but rather analyze the data separately or account for batch effect in the statistical models where applicable.

7 Acknowledgements

I’d like to thank Wang and Lê Cao for publishing the code along with their paper “Managing batch effects in microbiome data”. Part of their code is used for the present analysis.

8 Session information

sessionInfo()
## R version 4.0.5 (2021-03-31)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19041)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United States.1252 
## [2] LC_CTYPE=English_United States.1252   
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.1252    
## system code page: 65001
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] RColorBrewer_1.1-2          plotly_4.9.3               
##  [3] ggh4x_0.1.2.1               ggpubr_0.4.0               
##  [5] cowplot_1.1.1               sva_3.38.0                 
##  [7] genefilter_1.72.0           mgcv_1.8-35                
##  [9] RUVSeq_1.24.0               edgeR_3.32.0               
## [11] limma_3.46.0                EDASeq_2.24.0              
## [13] ShortRead_1.48.0            GenomicAlignments_1.26.0   
## [15] SummarizedExperiment_1.20.0 MatrixGenerics_1.2.0       
## [17] matrixStats_0.58.0          Rsamtools_2.6.0            
## [19] GenomicRanges_1.42.0        GenomeInfoDb_1.26.0        
## [21] Biostrings_2.58.0           XVector_0.30.0             
## [23] IRanges_2.24.0              S4Vectors_0.28.0           
## [25] BiocParallel_1.24.1         Biobase_2.50.0             
## [27] BiocGenerics_0.36.0         mixOmics_6.14.0            
## [29] MASS_7.3-53.1               MicrobeR_0.3.2             
## [31] picante_1.8.2               nlme_3.1-152               
## [33] vegan_2.5-7                 lattice_0.20-41            
## [35] permute_0.9-5               ape_5.5                    
## [37] microbiome_1.12.0           phyloseq_1.34.0            
## [39] qiime2R_0.99.35             forcats_0.5.1              
## [41] stringr_1.4.0               dplyr_1.0.5                
## [43] purrr_0.3.4                 readr_1.4.0                
## [45] tidyr_1.1.3                 tibble_3.1.1               
## [47] ggplot2_3.3.3               tidyverse_1.3.1            
## [49] here_1.0.1                  knitr_1.33                 
## 
## loaded via a namespace (and not attached):
##   [1] rappdirs_0.3.3         rtracklayer_1.50.0     R.methodsS3_1.8.1     
##   [4] bit64_4.0.5            aroma.light_3.20.0     DelayedArray_0.16.0   
##   [7] R.utils_2.10.1         data.table_1.14.0      rpart_4.1-15          
##  [10] hwriter_1.3.2          RCurl_1.98-1.3         generics_0.1.0        
##  [13] GenomicFeatures_1.42.0 RSQLite_2.2.7          bit_4.0.4             
##  [16] xml2_1.3.2             lubridate_1.7.10       assertthat_0.2.1      
##  [19] xfun_0.22              hms_1.0.0              jquerylib_0.1.4       
##  [22] evaluate_0.14          fansi_0.4.2            progress_1.2.2        
##  [25] dbplyr_2.1.1           readxl_1.3.1           igraph_1.2.6          
##  [28] DBI_1.1.1              htmlwidgets_1.5.3      rARPACK_0.11-0        
##  [31] ellipsis_0.3.1         crosstalk_1.1.1        RSpectra_0.16-0       
##  [34] backports_1.2.1        annotate_1.68.0        biomaRt_2.46.0        
##  [37] vctrs_0.3.6            abind_1.4-5            cachem_1.0.4          
##  [40] withr_2.4.1            checkmate_2.0.0        treeio_1.14.0         
##  [43] prettyunits_1.1.1      cluster_2.1.1          lazyeval_0.2.2        
##  [46] crayon_1.4.1           ellipse_0.4.2          pkgconfig_2.0.3       
##  [49] zCompositions_1.3.4    labeling_0.4.2         nnet_7.3-15           
##  [52] rlang_0.4.10           lifecycle_1.0.0        BiocFileCache_1.14.0  
##  [55] modelr_0.1.8           cellranger_1.1.0       rprojroot_2.0.2       
##  [58] Matrix_1.3-2           aplot_0.0.6            phangorn_2.6.3        
##  [61] carData_3.0-4          Rhdf5lib_1.12.0        reprex_2.0.0          
##  [64] base64enc_0.1-3        png_0.1-7              viridisLite_0.3.0     
##  [67] bitops_1.0-7           R.oo_1.24.0            rhdf5filters_1.2.0    
##  [70] blob_1.2.1             jpeg_0.1-8.1           rstatix_0.7.0         
##  [73] DECIPHER_2.18.1        ggsignif_0.6.1         rtk_0.2.6.1           
##  [76] scales_1.1.1           memoise_2.0.0          magrittr_2.0.1        
##  [79] plyr_1.8.6             zlibbioc_1.36.0        compiler_4.0.5        
##  [82] philr_1.16.0           cli_2.5.0              ade4_1.7-16           
##  [85] patchwork_1.1.1        htmlTable_2.1.0        Formula_1.2-4         
##  [88] tidyselect_1.1.0       stringi_1.5.3          highr_0.9             
##  [91] yaml_2.2.1             askpass_1.1            locfit_1.5-9.4        
##  [94] latticeExtra_0.6-29    ggrepel_0.9.1          grid_4.0.5            
##  [97] sass_0.3.1             fastmatch_1.1-0        tools_4.0.5           
## [100] rio_0.5.26             rstudioapi_0.13        foreach_1.5.1         
## [103] foreign_0.8-81         gridExtra_2.3          farver_2.1.0          
## [106] Rtsne_0.15             digest_0.6.27          rvcheck_0.1.8         
## [109] BiocManager_1.30.12    quadprog_1.5-8         Rcpp_1.0.6            
## [112] car_3.0-10             broom_0.7.6            httr_1.4.2            
## [115] AnnotationDbi_1.52.0   colorspace_2.0-0       rvest_1.0.0           
## [118] XML_3.99-0.5           fs_1.5.0               truncnorm_1.0-8       
## [121] splines_4.0.5          tidytree_0.3.3         multtest_2.46.0       
## [124] xtable_1.8-4           jsonlite_1.7.2         ggtree_2.4.0          
## [127] corpcor_1.6.9          R6_2.5.0               Hmisc_4.5-0           
## [130] NADA_1.6-1.1           pillar_1.6.0           htmltools_0.5.1.1     
## [133] glue_1.4.2             fastmap_1.1.0          DT_0.18               
## [136] codetools_0.2-18       utf8_1.2.1             bslib_0.2.4           
## [139] curl_4.3               zip_2.1.1              openxlsx_4.2.3        
## [142] openssl_1.4.3          survival_3.2-11        rmarkdown_2.7         
## [145] biomformat_1.18.0      munsell_0.5.0          rhdf5_2.34.0          
## [148] GenomeInfoDbData_1.2.4 iterators_1.0.13       haven_2.4.1           
## [151] reshape2_1.4.4         gtable_0.3.0